See https://jennybc.github.io/purrr-tutorial/ls00_inspect-explore.html . Note that I installed purrr, repurrrsive (has examples) and listviewer.
str is a good choice, especially when used with max.level and list.len.
library(purrr)
library(repurrrsive)
str(wesanderson)
## List of 15
## $ GrandBudapest : chr [1:4] "#F1BB7B" "#FD6467" "#5B1A18" "#D67236"
## $ Moonrise1 : chr [1:4] "#F3DF6C" "#CEAB07" "#D5D5D3" "#24281A"
## $ Royal1 : chr [1:4] "#899DA4" "#C93312" "#FAEFD1" "#DC863B"
## $ Moonrise2 : chr [1:4] "#798E87" "#C27D38" "#CCC591" "#29211F"
## $ Cavalcanti : chr [1:5] "#D8B70A" "#02401B" "#A2A475" "#81A88D" ...
## $ Royal2 : chr [1:5] "#9A8822" "#F5CDB4" "#F8AFA8" "#FDDDA0" ...
## $ GrandBudapest2: chr [1:4] "#E6A0C4" "#C6CDF7" "#D8A499" "#7294D4"
## $ Moonrise3 : chr [1:5] "#85D4E3" "#F4B5BD" "#9C964A" "#CDC08C" ...
## $ Chevalier : chr [1:4] "#446455" "#FDD262" "#D3DDDC" "#C7B19C"
## $ Zissou : chr [1:5] "#3B9AB2" "#78B7C5" "#EBCC2A" "#E1AF00" ...
## $ FantasticFox : chr [1:5] "#DD8D29" "#E2D200" "#46ACC8" "#E58601" ...
## $ Darjeeling : chr [1:5] "#FF0000" "#00A08A" "#F2AD00" "#F98400" ...
## $ Rushmore : chr [1:5] "#E1BD6D" "#EABE94" "#0B775E" "#35274A" ...
## $ BottleRocket : chr [1:7] "#A42820" "#5F5647" "#9B110E" "#3F5151" ...
## $ Darjeeling2 : chr [1:5] "#ECCBAE" "#046C9A" "#D69C4E" "#ABDDDE" ...
or
listviewer::jsonedit(wesanderson, mode="view")
OK, let’s look at Game of Thrones characters (GOT)
listviewer::jsonedit(got_chars, mode='view')
str(got_chars, list.len=3)
## List of 29
## $ :List of 18
## ..$ url : chr "http://www.anapioficeandfire.com/api/characters/1022"
## ..$ id : int 1022
## ..$ name : chr "Theon Greyjoy"
## .. [list output truncated]
## $ :List of 18
## ..$ url : chr "http://www.anapioficeandfire.com/api/characters/1052"
## ..$ id : int 1052
## ..$ name : chr "Tyrion Lannister"
## .. [list output truncated]
## $ :List of 18
## ..$ url : chr "http://www.anapioficeandfire.com/api/characters/1074"
## ..$ id : int 1074
## ..$ name : chr "Victarion Greyjoy"
## .. [list output truncated]
## [list output truncated]
Let’s look more inside.
str(got_chars[[1]], list.len=8)
## List of 18
## $ url : chr "http://www.anapioficeandfire.com/api/characters/1022"
## $ id : int 1022
## $ name : chr "Theon Greyjoy"
## $ gender : chr "Male"
## $ culture : chr "Ironborn"
## $ born : chr "In 278 AC or 279 AC, at Pyke"
## $ died : chr ""
## $ alive : logi TRUE
## [list output truncated]
GitHub users…
str(gh_users, max.level = 1)
## List of 6
## $ :List of 30
## $ :List of 30
## $ :List of 30
## $ :List of 30
## $ :List of 30
## $ :List of 30
Remember vectorized operations…
(3:5)^2
## [1] 9 16 25
Closest cousin is lapply
map(c(9, 16, 25), sqrt)
## [[1]]
## [1] 3
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 5
Template for map functions,
map(Your data, your function)
Practice on game of thrones data.
Pull out an element by its name. Like function(x) x[["TEXT"]]
map(got_chars[1:4], "name")
## [[1]]
## [1] "Theon Greyjoy"
##
## [[2]]
## [1] "Tyrion Lannister"
##
## [[3]]
## [1] "Victarion Greyjoy"
##
## [[4]]
## [1] "Will"
If you give a number, it will pull out that position. Like function(x) x[[i]].
map(got_chars[5:8], 3)
## [[1]]
## [1] "Areo Hotah"
##
## [[2]]
## [1] "Chett"
##
## [[3]]
## [1] "Cressen"
##
## [[4]]
## [1] "Arianne Martell"
You can pipe to it
got_chars[1:4] %>%
map("name")
## [[1]]
## [1] "Theon Greyjoy"
##
## [[2]]
## [1] "Tyrion Lannister"
##
## [[3]]
## [1] "Victarion Greyjoy"
##
## [[4]]
## [1] "Will"
map will always return a list. Use the typed map... commands to get vectors.
map_chr(got_chars[1:4], "name")
## [1] "Theon Greyjoy" "Tyrion Lannister" "Victarion Greyjoy"
## [4] "Will"
map_chr(got_chars[1:3], 3)
## [1] "Theon Greyjoy" "Tyrion Lannister" "Victarion Greyjoy"
You can extract multiple values.
got_chars[[3]][c("name", "culture", "gender", "born")]
## $name
## [1] "Victarion Greyjoy"
##
## $culture
## [1] "Ironborn"
##
## $gender
## [1] "Male"
##
## $born
## [1] "In 268 AC or before, at Pyke"
Here’s a strange equivalent command
x <- map(got_chars, `[`, c("name", "culture", "gender", "born") )
str(x[16:17])
## List of 2
## $ :List of 4
## ..$ name : chr "Brandon Stark"
## ..$ culture: chr "Northmen"
## ..$ gender : chr "Male"
## ..$ born : chr "In 290 AC, at Winterfell"
## $ :List of 4
## ..$ name : chr "Brienne of Tarth"
## ..$ culture: chr ""
## ..$ gender : chr "Female"
## ..$ born : chr "In 280 AC"
Maybe better to use magrittr’s extract function
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
x <- map(got_chars, extract, c("name", "culture", "gender", "born"))
str(x[18:19])
## List of 2
## $ :List of 4
## ..$ name : chr "Catelyn Stark"
## ..$ culture: chr "Rivermen"
## ..$ gender : chr "Female"
## ..$ born : chr "In 264 AC, at Riverrun"
## $ :List of 4
## ..$ name : chr "Cersei Lannister"
## ..$ culture: chr "Westerman"
## ..$ gender : chr "Female"
## ..$ born : chr "In 266 AC, at Casterly Rock"
Make a data frame!! Note the correct columns.
map_df(got_chars, extract, c("name", "culture", "gender", "born"))
## Warning in bind_rows_(x, .id): '.Random.seed' is not an integer vector but
## of type 'NULL', so ignored
## # A tibble: 29 × 4
## name culture gender
## <chr> <chr> <chr>
## 1 Theon Greyjoy Ironborn Male
## 2 Tyrion Lannister Male
## 3 Victarion Greyjoy Ironborn Male
## 4 Will Male
## 5 Areo Hotah Norvoshi Male
## 6 Chett Male
## 7 Cressen Male
## 8 Arianne Martell Dornish Female
## 9 Daenerys Targaryen Valyrian Female
## 10 Davos Seaworth Westeros Male
## # ... with 19 more rows, and 1 more variables: born <chr>
This is a little dangerous. Columns may not come out right. Better to be explicit..
library(tibble)
# Build the data frame one column at a time. Each typed map_*() call states
# the column's type up front, so a wrong type fails loudly instead of being
# silently coerced.
tibble(
  name = map_chr(got_chars, "name"),
  culture = map_chr(got_chars, "culture"),
  gender = map_chr(got_chars, "gender"),
  id = map_int(got_chars, "id"),
  born = map_chr(got_chars, "born"),
  alive = map_lgl(got_chars, "alive")
)
## # A tibble: 29 × 6
## name culture gender id
## <chr> <chr> <chr> <int>
## 1 Theon Greyjoy Ironborn Male 1022
## 2 Tyrion Lannister Male 1052
## 3 Victarion Greyjoy Ironborn Male 1074
## 4 Will Male 1109
## 5 Areo Hotah Norvoshi Male 1166
## 6 Chett Male 1267
## 7 Cressen Male 1295
## 8 Arianne Martell Dornish Female 130
## 9 Daenerys Targaryen Valyrian Female 1303
## 10 Davos Seaworth Westeros Male 1319
## # ... with 19 more rows, and 2 more variables: born <chr>, alive <lgl>
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:purrr':
##
## contains, order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
gh_users provides info about 6 github users. It is a recursive list. Let’s try to figure out what is in the list.
str(gh_users, max.level=1)
## List of 6
## $ :List of 30
## $ :List of 30
## $ :List of 30
## $ :List of 30
## $ :List of 30
## $ :List of 30
str(gh_users[[1]], list.len=6)
## List of 30
## $ login : chr "gaborcsardi"
## $ id : int 660288
## $ avatar_url : chr "https://avatars.githubusercontent.com/u/660288?v=3"
## $ gravatar_id : chr ""
## $ url : chr "https://api.github.com/users/gaborcsardi"
## $ html_url : chr "https://github.com/gaborcsardi"
## [list output truncated]
listviewer::jsonedit(gh_users, mode='view')
Remember the usage of purrr::map, map(.x, .f, ...) … first is the list, second is the function. Remember the shortcut based on name.
map(gh_users, "login")
## [[1]]
## [1] "gaborcsardi"
##
## [[2]]
## [1] "jennybc"
##
## [[3]]
## [1] "jtleek"
##
## [[4]]
## [1] "juliasilge"
##
## [[5]]
## [1] "leeper"
##
## [[6]]
## [1] "masalmon"
Look at the 18th element
map(gh_users, 18)
## [[1]]
## [1] "Gábor Csárdi"
##
## [[2]]
## [1] "Jennifer (Jenny) Bryan"
##
## [[3]]
## [1] "Jeff L."
##
## [[4]]
## [1] "Julia Silge"
##
## [[5]]
## [1] "Thomas J. Leeper"
##
## [[6]]
## [1] "Maëlle Salmon"
If you provide a vector, each index pulls out something at the next level. For example,
gh_repos %>% map_chr(c(1,3))
## [1] "gaborcsardi/after" "jennybc/2013-11_sfu" "jtleek/advdatasci"
## [4] "juliasilge/2016-14" "leeper/ampolcourse" "masalmon/aqi_pdf"
This gives the full name (element 3) of the first repository of each user (note this is gh_repos, not gh_users).
listviewer::jsonedit(gh_repos, mode='view')
Let’s now make a dataframe with one row per repository. Use tibble::enframe
(unames <- map_chr(gh_repos, c(1,4,1))) # 1,4,1 -> first repo, owner info, login
## [1] "gaborcsardi" "jennybc" "jtleek" "juliasilge" "leeper"
## [6] "masalmon"
# One row per user: label each user's repository list with their login, then
# enframe the named list into a `username` column and a `gh_repos`
# list-column.
udf <- gh_repos %>%
  set_names(unames) %>%
  enframe(name = "username", value = "gh_repos")
glimpse(udf)
## Observations: 6
## Variables: 2
## $ username <chr> "gaborcsardi", "jennybc", "jtleek", "juliasilge", "le...
## $ gh_repos <list> [[[61160198, "after", "gaborcsardi/after", ["gaborcs...
For some reason, the notebook gives an error when it tries to print
Let’s see what we can do. How many repos are there per user?
udf %>% mutate(n_repos = map_int(gh_repos, length)) %>% glimpse
## Observations: 6
## Variables: 3
## $ username <chr> "gaborcsardi", "jennybc", "jtleek", "juliasilge", "le...
## $ gh_repos <list> [[[61160198, "after", "gaborcsardi/after", ["gaborcs...
## $ n_repos <int> 30, 30, 30, 26, 30, 30
What is a strategy for doing something complicated? First, try it on the first element.
one_user <- udf$gh_repos[[1]] # First user
one_repo <- one_user[[1]] # First repo
str(one_repo, max.level=1, list.len=5) # Info about the repo
## List of 68
## $ id : int 61160198
## $ name : chr "after"
## $ full_name : chr "gaborcsardi/after"
## $ owner :List of 17
## .. [list output truncated]
## $ private : logi FALSE
## [list output truncated]
Let’s get out something with base R
one_repo[c("name", "fork", "open_issues")]
## $name
## [1] "after"
##
## $fork
## [1] FALSE
##
## $open_issues
## [1] 0
Ok, now do it for more
map_df(one_user, extract, c("name", "fork", "open_issues") )
## # A tibble: 30 × 3
## name fork open_issues
## <chr> <lgl> <int>
## 1 after FALSE 0
## 2 argufy FALSE 6
## 3 ask FALSE 4
## 4 baseimports FALSE 0
## 5 citest TRUE 0
## 6 clisymbols FALSE 0
## 7 cmaker TRUE 0
## 8 cmark TRUE 0
## 9 conditions TRUE 0
## 10 crayon FALSE 7
## # ... with 20 more rows
Ok, now scale up for all users
# For every user, turn their list of repositories into a small data frame of
# name / fork / open_issues and store it in a new list-column.
udf %>%
  mutate(
    repo_info = gh_repos %>%
      map(. %>% map_df(extract, c("name", "fork", "open_issues")))
  ) %>%
  glimpse()
## Observations: 6
## Variables: 3
## $ username <chr> "gaborcsardi", "jennybc", "jtleek", "juliasilge", "l...
## $ gh_repos <list> [[[61160198, "after", "gaborcsardi/after", ["gaborc...
## $ repo_info <list> [<c("after", "argufy", "ask", "baseimports", "cites...
How do we remove all of the list columns?
# Flatten to one row per repository: build a small data frame per user,
# drop the raw nested list, then unnest the remaining list-column.
# The outer parentheses print the result as well as assigning it.
(rdf <- udf %>%
  mutate(
    repo_info = gh_repos %>%
      map(. %>% map_df(`[`, c("name", "fork", "open_issues")))
  ) %>%
  select(-gh_repos) %>%
  # Name the column to unnest explicitly; calling unnest() with no columns
  # is deprecated and warns in tidyr >= 1.0.0.
  tidyr::unnest(repo_info))
## # A tibble: 176 × 4
## username name fork open_issues
## <chr> <chr> <lgl> <int>
## 1 gaborcsardi after FALSE 0
## 2 gaborcsardi argufy FALSE 6
## 3 gaborcsardi ask FALSE 4
## 4 gaborcsardi baseimports FALSE 0
## 5 gaborcsardi citest TRUE 0
## 6 gaborcsardi clisymbols FALSE 0
## 7 gaborcsardi cmaker TRUE 0
## 8 gaborcsardi cmark TRUE 0
## 9 gaborcsardi conditions TRUE 0
## 10 gaborcsardi crayon FALSE 7
## # ... with 166 more rows
Let’s drop forks and show for each user the three repositories with the most open issues.
# Per user: discard forks, order by open issues (highest first), and keep the
# first three rows of each group. The result is still grouped by username.
rdf %>%
  filter(!fork) %>%
  select(-fork) %>%
  group_by(username) %>%
  arrange(username, desc(open_issues)) %>%
  slice(1:3)
## Source: local data frame [18 x 3]
## Groups: username [6]
##
## username name open_issues
## <chr> <chr> <int>
## 1 gaborcsardi gh 8
## 2 gaborcsardi crayon 7
## 3 gaborcsardi argufy 6
## 4 jennybc 2014-01-27-miami 4
## 5 jennybc bingo 3
## 6 jennybc candy 2
## 7 jtleek datasharing 399
## 8 jtleek dataanalysis 5
## 9 jtleek genstats 3
## 10 juliasilge tidytext 5
## 11 juliasilge choroplethrUTCensusTract 0
## 12 juliasilge CountyHealthApp 0
## 13 leeper crandatapkgs 12
## 14 leeper csvy 2
## 15 leeper ciplotm 1
## 16 masalmon cpcb 5
## 17 masalmon rtimicropem 5
## 18 masalmon laads 4
Note that map can take a formula as the function. This is unusual and a nice way to indicate an anonymous function.
Let’s work with Game of Thrones again. Pull aliases
aliases <- set_names(map(got_chars, "aliases"), map_chr(got_chars, "name"))
(aliases <- aliases[c("Theon Greyjoy", "Asha Greyjoy", "Brienne of Tarth")])
## $`Theon Greyjoy`
## [1] "Prince of Fools" "Theon Turncloak" "Reek" "Theon Kinslayer"
##
## $`Asha Greyjoy`
## [1] "Esgred" "The Kraken's Daughter"
##
## $`Brienne of Tarth`
## [1] "The Maid of Tarth" "Brienne the Beauty" "Brienne the Blue"
Use an existing function
# Collapse a character vector into a single string, with its elements
# separated by " | ".
my_fun <- function(x) {
  separator <- " | "
  paste(x, collapse = separator)
}
map(aliases, my_fun)
## $`Theon Greyjoy`
## [1] "Prince of Fools | Theon Turncloak | Reek | Theon Kinslayer"
##
## $`Asha Greyjoy`
## [1] "Esgred | The Kraken's Daughter"
##
## $`Brienne of Tarth`
## [1] "The Maid of Tarth | Brienne the Beauty | Brienne the Blue"
Anonymous, conventional
map(aliases, function(x) paste(x, collapse = " | "))
## $`Theon Greyjoy`
## [1] "Prince of Fools | Theon Turncloak | Reek | Theon Kinslayer"
##
## $`Asha Greyjoy`
## [1] "Esgred | The Kraken's Daughter"
##
## $`Brienne of Tarth`
## [1] "The Maid of Tarth | Brienne the Beauty | Brienne the Blue"
or
map(aliases, paste, collapse = " | ")
## $`Theon Greyjoy`
## [1] "Prince of Fools | Theon Turncloak | Reek | Theon Kinslayer"
##
## $`Asha Greyjoy`
## [1] "Esgred | The Kraken's Daughter"
##
## $`Brienne of Tarth`
## [1] "The Maid of Tarth | Brienne the Beauty | Brienne the Blue"
Anonymous function with formula… Start with ~ and use .x for the input (e.g. an element)
map(aliases, ~ paste(.x, collapse = " | "))
## $`Theon Greyjoy`
## [1] "Prince of Fools | Theon Turncloak | Reek | Theon Kinslayer"
##
## $`Asha Greyjoy`
## [1] "Esgred | The Kraken's Daughter"
##
## $`Brienne of Tarth`
## [1] "The Maid of Tarth | Brienne the Beauty | Brienne the Blue"
The workflow is to try on a single element and then expand. For example,
(a <- map(got_chars, "aliases")[[19]]) # Oops - it's empty
## list()
(a <- map(got_chars, "aliases")[[16]]) # Better!
## [1] "Bran" "Bran the Broken" "The Winged Wolf"
paste(a, sep = " | ") # Oops - not what I want
## [1] "Bran" "Bran the Broken" "The Winged Wolf"
paste(a, collapse = " | ") # OK
## [1] "Bran | Bran the Broken | The Winged Wolf"
got_chars[15:17] %>%
map("aliases") %>%
map_chr(paste, collapse = " | ") # YES!!
## [1] "Varamyr Sixskins | Haggon | Lump"
## [2] "Bran | Bran the Broken | The Winged Wolf"
## [3] "The Maid of Tarth | Brienne the Beauty | Brienne the Blue"
Do a dataframe instead. Try enframe again.
aliases <- set_names(map(got_chars, "aliases"), map_chr(got_chars, "name"))
map_chr(aliases[c(3, 10, 20, 24)], ~ paste(.x, collapse = " | ")) %>%
tibble::enframe(value="aliases")
## # A tibble: 4 × 2
## name
## <chr>
## 1 Victarion Greyjoy
## 2 Davos Seaworth
## 3 Eddard Stark
## 4 Aeron Greyjoy
## # ... with 1 more variables: aliases <chr>
Or, another way
tibble::tibble(
name = map_chr(got_chars, "name"),
aliases = got_chars %>%
map("aliases") %>%
map_chr(~ paste(.x, collapse = " | "))
) %>%
dplyr::slice(c(3, 10, 20, 24))
## # A tibble: 4 × 2
## name
## <chr>
## 1 Victarion Greyjoy
## 2 Davos Seaworth
## 3 Eddard Stark
## 4 Aeron Greyjoy
## # ... with 1 more variables: aliases <chr>
This is a typical workflow. Take a hard-to-understand list and pull out what you want into an easy-to-understand data frame.
What if you want to map over two vectors or lists in parallel
map2(.x, .y, .f, ...) # and map2_chr, map2_lgl
Let’s form name and birthdate for the characters
nms <- got_chars %>% map_chr("name")
birth <- got_chars %>% map_chr("born")
# Build a "<x> was born <y>" sentence. paste() is vectorised, so x and y may
# be single strings or parallel character vectors.
my_fun <- function(x, y) paste(x, "was born", y)
map2_chr(nms, birth, my_fun) %>% head
## [1] "Theon Greyjoy was born In 278 AC or 279 AC, at Pyke"
## [2] "Tyrion Lannister was born In 273 AC, at Casterly Rock"
## [3] "Victarion Greyjoy was born In 268 AC or before, at Pyke"
## [4] "Will was born "
## [5] "Areo Hotah was born In 257 AC or before, at Norvos"
## [6] "Chett was born At Hag's Mire"
Other ways…
map2_chr(nms, birth, function(x, y) paste(x, "was born", y)) %>% head
## [1] "Theon Greyjoy was born In 278 AC or 279 AC, at Pyke"
## [2] "Tyrion Lannister was born In 273 AC, at Casterly Rock"
## [3] "Victarion Greyjoy was born In 268 AC or before, at Pyke"
## [4] "Will was born "
## [5] "Areo Hotah was born In 257 AC or before, at Norvos"
## [6] "Chett was born At Hag's Mire"
map2_chr(nms, birth, ~ paste(.x, "was born", .y)) %>% head
## [1] "Theon Greyjoy was born In 278 AC or 279 AC, at Pyke"
## [2] "Tyrion Lannister was born In 273 AC, at Casterly Rock"
## [3] "Victarion Greyjoy was born In 268 AC or before, at Pyke"
## [4] "Will was born "
## [5] "Areo Hotah was born In 257 AC or before, at Norvos"
## [6] "Chett was born At Hag's Mire"
If you need more than two parallel vectors or lists, then use pmap and friends. Input is a list of lists
pmap(.l, .f, ...)
# Collect each character's name plus the list-columns `aliases` and
# `allegiances` into one tibble. Wrapping the right-hand side in braces stops
# %>% from also inserting got_chars as the first argument of tibble(), so `.`
# can be reused for every column.
df <- got_chars %>% {
  tibble::tibble(
    name = map_chr(., "name"),
    aliases = map(., "aliases"),
    allegiances = map(., "allegiances")
  )
}
glimpse(df)
## Observations: 29
## Variables: 3
## $ name <chr> "Theon Greyjoy", "Tyrion Lannister", "Victarion Gr...
## $ aliases <list> [<"Prince of Fools", "Theon Turncloak", "Reek", "...
## $ allegiances <list> ["House Greyjoy of Pyke", "House Lannister of Cas...
# Summarise one character: how many aliases and allegiances they have.
# The argument names mirror the columns of the tibble passed to pmap_chr(),
# so each row's values can be matched to the arguments by name.
my_fun <- function(name, aliases, allegiances) {
  n_aliases <- length(aliases)
  n_allegiances <- length(allegiances)
  paste(name, "has", n_aliases, "aliases and", n_allegiances, "allegiances")
}
df %>%
pmap_chr(my_fun) %>% tail
## [1] "Aeron Greyjoy has 2 aliases and 1 allegiances"
## [2] "Kevan Lannister has 0 aliases and 1 allegiances"
## [3] "Melisandre has 5 aliases and 0 allegiances"
## [4] "Merrett Frey has 1 aliases and 1 allegiances"
## [5] "Quentyn Martell has 4 aliases and 1 allegiances"
## [6] "Sansa Stark has 3 aliases and 2 allegiances"
Here’s a nice way to store data
load(url("http://varianceexplained.org/files/trump_tweets_df.rda"))
glimpse(trump_tweets_df)
## Observations: 1,512
## Variables: 16
## $ text <chr> "My economic policy speech will be carried live ...
## $ favorited <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
## $ favoriteCount <dbl> 9214, 6981, 15724, 19837, 34051, 29831, 19223, 1...
## $ replyToSN <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ created <dttm> 2016-08-08 15:20:44, 2016-08-08 13:28:20, 2016-...
## $ truncated <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
## $ replyToSID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ id <chr> "762669882571980801", "762641595439190016", "762...
## $ replyToUID <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ statusSource <chr> "<a href=\"http://twitter.com/download/android\"...
## $ screenName <chr> "realDonaldTrump", "realDonaldTrump", "realDonal...
## $ retweetCount <dbl> 3107, 2390, 6691, 6402, 11717, 9892, 5784, 7930,...
## $ isRetweet <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
## $ retweeted <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
## $ longitude <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ latitude <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
tweets <- trump_tweets_df$text
tweets %>% head() %>% strtrim(70)
## [1] "My economic policy speech will be carried live at 12:15 P.M. Enjoy!"
## [2] "Join me in Fayetteville, North Carolina tomorrow evening at 6pm. Ticke"
## [3] "#ICYMI: \"Will Media Apologize to Trump?\" https://t.co/ia7rKBmioA"
## [4] "Michael Morell, the lightweight former Acting Director of C.I.A., and "
## [5] "The media is going crazy. They totally distort so many things on purpo"
## [6] "I see where Mayor Stephanie Rawlings-Blake of Baltimore is pushing Cro"
Words that are associated with his Android device
regex <- "badly|crazy|weak|spent|strong|dumb|joke|guns|funny|dead"
Let’s extract these words from the tweets we have while using purrr
Let’s scale down the problem first
tw <- tweets[c(1, 2, 5, 6, 198, 347, 919)]
tw %>% strtrim(70)
## [1] "My economic policy speech will be carried live at 12:15 P.M. Enjoy!"
## [2] "Join me in Fayetteville, North Carolina tomorrow evening at 6pm. Ticke"
## [3] "The media is going crazy. They totally distort so many things on purpo"
## [4] "I see where Mayor Stephanie Rawlings-Blake of Baltimore is pushing Cro"
## [5] "Bernie Sanders started off strong, but with the selection of Kaine for"
## [6] "Crooked Hillary Clinton is unfit to serve as President of the U.S. Her"
## [7] "The Cruz-Kasich pact is under great strain. This joke of a deal is fal"
Let’s use gregexpr because it returns a crazy list
matches <- gregexpr(regex, tw)
str(matches)
## List of 7
## $ : atomic [1:1] -1
## ..- attr(*, "match.length")= int -1
## ..- attr(*, "useBytes")= logi TRUE
## $ : atomic [1:1] -1
## ..- attr(*, "match.length")= int -1
## ..- attr(*, "useBytes")= logi TRUE
## $ : atomic [1:1] 20
## ..- attr(*, "match.length")= int 5
## ..- attr(*, "useBytes")= logi TRUE
## $ : atomic [1:1] 134
## ..- attr(*, "match.length")= int 4
## ..- attr(*, "useBytes")= logi TRUE
## $ : atomic [1:2] 28 95
## ..- attr(*, "match.length")= int [1:2] 6 4
## ..- attr(*, "useBytes")= logi TRUE
## $ : atomic [1:2] 87 114
## ..- attr(*, "match.length")= int [1:2] 4 6
## ..- attr(*, "useBytes")= logi TRUE
## $ : atomic [1:3] 50 112 123
## ..- attr(*, "match.length")= int [1:3] 4 4 4
## ..- attr(*, "useBytes")= logi TRUE
matches[[7]]
## [1] 50 112 123
## attr(,"match.length")
## [1] 4 4 4
## attr(,"useBytes")
## [1] TRUE
Whaaa? matches is:
* A list. One element per element in tw
* Each element is an integer vector
  * -1 if no matches found
  * The position(s) of the first character of each match, otherwise
* Each element has two attributes. Look at match.length
  * -1 if no matches
  * Length of each match, otherwise
We can get out the words, but it won’t be fun.
We’re going to use substring eventually. It wants,
substring(text, first, last)
Each tweet can be the text. There’s enough match data to figure out where the match in the text starts (i.e. first). Getting last will be harder.
… more to do here…
library(gapminder)
library(broom)
library(ggplot2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
Play with gapminder
gapminder %>%
ggplot(aes(year, lifeExp, group=country)) + geom_line(alpha = 1/3)
Let’s fit each country
gapminder %>%
ggplot(aes(year, lifeExp, group=country)) +
geom_line(stat="smooth", method="lm", alpha=1/3, se=FALSE, colour="black")
If we want to see the fits themselves, we need to do it ourselves
gap_nested <- gapminder %>%
group_by(country) %>% nest()
gap_nested
## # A tibble: 142 × 2
## country data
## <fctr> <list>
## 1 Afghanistan <tibble [12 × 5]>
## 2 Albania <tibble [12 × 5]>
## 3 Algeria <tibble [12 × 5]>
## 4 Angola <tibble [12 × 5]>
## 5 Argentina <tibble [12 × 5]>
## 6 Australia <tibble [12 × 5]>
## 7 Austria <tibble [12 × 5]>
## 8 Bahrain <tibble [12 × 5]>
## 9 Bangladesh <tibble [12 × 5]>
## 10 Belgium <tibble [12 × 5]>
## # ... with 132 more rows
gap_nested$data[[1]]
## # A tibble: 12 × 5
## continent year lifeExp pop gdpPercap
## <fctr> <int> <dbl> <int> <dbl>
## 1 Asia 1952 28.801 8425333 779.4453
## 2 Asia 1957 30.332 9240934 820.8530
## 3 Asia 1962 31.997 10267083 853.1007
## 4 Asia 1967 34.020 11537966 836.1971
## 5 Asia 1972 36.088 13079460 739.9811
## 6 Asia 1977 38.438 14880372 786.1134
## 7 Asia 1982 39.854 12881816 978.0114
## 8 Asia 1987 40.822 13867957 852.3959
## 9 Asia 1992 41.674 16317921 649.3414
## 10 Asia 1997 41.763 22227415 635.3414
## 11 Asia 2002 42.129 25268405 726.7341
## 12 Asia 2007 43.828 31889923 974.5803
Note that nest puts all of the data about that country into its own data frame.
Now we can do fits!
# Fit a separate linear model of life expectancy on year for each country and
# keep the fitted lm objects in a new list-column `fit`.
gap_fits <- gap_nested %>%
  mutate(fit = map(data, ~ lm(lifeExp ~ year, data = .x)))
Let’s look at one model
gap_fits %>% tail(3)
## # A tibble: 3 × 3
## country data fit
## <fctr> <list> <list>
## 1 Yemen, Rep. <tibble [12 × 5]> <S3: lm>
## 2 Zambia <tibble [12 × 5]> <S3: lm>
## 3 Zimbabwe <tibble [12 × 5]> <S3: lm>
canada <- which(gap_fits$country == "Canada")
summary(gap_fits$fit[[canada]])
##
## Call:
## lm(formula = lifeExp ~ year, data = .x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3812 -0.1368 -0.0471 0.2481 0.3157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.583e+02 8.252e+00 -43.42 1.01e-12 ***
## year 2.189e-01 4.169e-03 52.50 1.52e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2492 on 10 degrees of freedom
## Multiple R-squared: 0.9964, Adjusted R-squared: 0.996
## F-statistic: 2757 on 1 and 10 DF, p-value: 1.521e-13
Let’s pull out all the \(r^2\) values.
gap_fits %>% mutate(rsq = map_dbl(fit, ~ summary(.x)[["r.squared"]])) %>%
arrange(rsq)
## # A tibble: 142 × 4
## country data fit rsq
## <fctr> <list> <list> <dbl>
## 1 Rwanda <tibble [12 × 5]> <S3: lm> 0.01715964
## 2 Botswana <tibble [12 × 5]> <S3: lm> 0.03402340
## 3 Zimbabwe <tibble [12 × 5]> <S3: lm> 0.05623196
## 4 Zambia <tibble [12 × 5]> <S3: lm> 0.05983644
## 5 Swaziland <tibble [12 × 5]> <S3: lm> 0.06821087
## 6 Lesotho <tibble [12 × 5]> <S3: lm> 0.08485635
## 7 Cote d'Ivoire <tibble [12 × 5]> <S3: lm> 0.28337240
## 8 South Africa <tibble [12 × 5]> <S3: lm> 0.31246865
## 9 Uganda <tibble [12 × 5]> <S3: lm> 0.34215382
## 10 Congo, Dem. Rep. <tibble [12 × 5]> <S3: lm> 0.34820278
## # ... with 132 more rows
Use broom to get the coefficient table
gap_fits %>%
mutate(coef = map(fit, broom::tidy))
## # A tibble: 142 × 4
## country data fit coef
## <fctr> <list> <list> <list>
## 1 Afghanistan <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 2 Albania <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 3 Algeria <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 4 Angola <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 5 Argentina <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 6 Australia <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 7 Austria <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 8 Bahrain <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 9 Bangladesh <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## 10 Belgium <tibble [12 × 5]> <S3: lm> <data.frame [2 × 5]>
## # ... with 132 more rows
Hmm - let’s unnest coef
gap_fits %>%
mutate(coef = map(fit, broom::tidy)) %>%
unnest(coef)
## # A tibble: 284 × 6
## country term estimate std.error statistic
## <fctr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan (Intercept) -507.5342716 40.484161954 -12.536613
## 2 Afghanistan year 0.2753287 0.020450934 13.462890
## 3 Albania (Intercept) -594.0725110 65.655359062 -9.048348
## 4 Albania year 0.3346832 0.033166387 10.091036
## 5 Algeria (Intercept) -1067.8590396 43.802200843 -24.379118
## 6 Algeria year 0.5692797 0.022127070 25.727749
## 7 Angola (Intercept) -376.5047531 46.583370599 -8.082385
## 8 Angola year 0.2093399 0.023532003 8.895964
## 9 Argentina (Intercept) -389.6063445 9.677729641 -40.258031
## 10 Argentina year 0.2317084 0.004888791 47.395847
## # ... with 274 more rows, and 1 more variables: p.value <dbl>